{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from __future__ import print_function, absolute_import, division, unicode_literals, with_statement\n",
    "from numpy.random import multivariate_normal\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import accuracy_score\n",
    "import numpy as np\n",
    "\n",
    "import cleanlab\n",
    "from cleanlab.noise_generation import generate_noise_matrix_from_trace, generate_noisy_labels\n",
    "from cleanlab.util import print_noise_matrix\n",
    "from cleanlab.latent_estimation import estimate_confident_joint_and_cv_pred_proba, estimate_latent\n",
    "from cleanlab.pruning import get_noise_indices\n",
    "from cleanlab.classification import LearningWithNoisyLabels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def show_decision_boundary(clf, title, show_noise = True):\n",
    "    \"\"\"Draw the decision regions of a fitted classifier behind the training points.\n",
    "\n",
    "    Relies on notebook-level names: X_train, y_train, s, K, colors, color_map,\n",
    "    np and plt (plt has to be imported before this function is called).\n",
    "\n",
    "    clf: fitted classifier; its predict() is called on a two-column array of mesh points.\n",
    "    title: text placed above the figure.\n",
    "    show_noise: if True, color each point by its given label in s and circle the\n",
    "        points where s differs from y_train; if False, color each point by y_train.\n",
    "    \"\"\"\n",
    "    step = .01  # resolution of the mesh\n",
    "    margin = .5  # padding around the training data on every side\n",
    "    x_lo, x_hi = X_train[:, 0].min() - margin, X_train[:, 0].max() + margin\n",
    "    y_lo, y_hi = X_train[:, 1].min() - margin, X_train[:, 1].max() + margin\n",
    "    grid_x, grid_y = np.meshgrid(np.arange(x_lo, x_hi, step), np.arange(y_lo, y_hi, step))\n",
    "\n",
    "    # Classify every mesh point, then fold the predictions back into the mesh shape.\n",
    "    mesh_points = np.c_[grid_x.ravel(), grid_y.ravel()]\n",
    "    regions = clf.predict(mesh_points).reshape(grid_x.shape)\n",
    "\n",
    "    plt.figure(figsize=(15, 12))\n",
    "    plt.axis('off')\n",
    "    plt.pcolormesh(grid_x, grid_y, regions, alpha=0.015)\n",
    "\n",
    "    # Every point is drawn as the digit of its latent class k.\n",
    "    for k in range(K):\n",
    "        in_class = y_train == k\n",
    "        pts = X_train[in_class]\n",
    "        if show_noise:\n",
    "            fill = [color_map[given] for given in s[in_class]]\n",
    "        else:\n",
    "            fill = colors[k]\n",
    "        plt.scatter(pts[:, 0], pts[:, 1], color=fill, s=150, marker=r\"${a}$\".format(a=str(k)), linewidth=1)\n",
    "\n",
    "    if show_noise:\n",
    "        # Circle the points whose given label differs from the latent one.\n",
    "        mislabeled = s != y_train\n",
    "        plt.scatter(X_train[:, 0][mislabeled], X_train[:, 1][mislabeled], s=400, facecolors='none', edgecolors='black', linewidth=0.8)\n",
    "\n",
    "    plt.title(title, fontsize=25)\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "seed = 1 # Fixed for reproducibility - remove the seeding to create random noise and distributions.\n",
    "np.random.seed(seed = seed)\n",
    "\n",
    "# One 2-D Gaussian per class: its mean, covariance and number of samples to draw.\n",
    "means = [ [3, 2], [7, 7], [0, 8] ]\n",
    "covs = [ [[5, -1.5],[-1.5, 1]] , [[1, 0.5],[0.5, 4]], [[5, 1],[1, 5]] ]\n",
    "sizes = [ 800, 400, 400 ]\n",
    "K = len(means) # number of classes\n",
    "\n",
    "data, labels = [], []\n",
    "test_data, test_labels = [], []\n",
    "for idx in range(K):\n",
    "    # Train is sampled before test for each class, so the seeded stream is consumed in a fixed order.\n",
    "    data.append(multivariate_normal(mean=means[idx], cov=covs[idx], size=sizes[idx]))\n",
    "    test_data.append(multivariate_normal(mean=means[idx], cov=covs[idx], size=sizes[idx]))\n",
    "    # Every sample drawn from Gaussian idx gets idx as its true label.\n",
    "    labels.append(np.array([idx] * sizes[idx]))\n",
    "    test_labels.append(np.array([idx] * sizes[idx]))\n",
    "\n",
    "X_train, y_train = np.vstack(data), np.hstack(labels)\n",
    "X_test, y_test = np.vstack(test_data), np.hstack(test_labels)\n",
    "\n",
    "# Compute p(y=k), the prior of each true class.\n",
    "py = np.bincount(y_train) / float(len(y_train))\n",
    "\n",
    "noise_matrix = generate_noise_matrix_from_trace(K, trace=1.5, py=py, valid_noise_matrix=True)\n",
    "\n",
    "# Generate our noisy labels using the noise_matrix.\n",
    "s = generate_noisy_labels(y_train, noise_matrix)\n",
    "# Compute p(s=k), the prior of each noisy label.\n",
    "ps = np.bincount(s) / float(len(s))\n",
    "\n",
    "# Estimate the confident joint, the latent distributions and the suspected label errors.\n",
    "confident_joint, psx = estimate_confident_joint_and_cv_pred_proba(X_train, s, seed=seed)\n",
    "est_py, est_noise_matrix, est_inverse_noise_matrix = estimate_latent(confident_joint, s)\n",
    "idx_errors = get_noise_indices(s, psx)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### To show off the power of **cleanlab**, we've chosen an example of multiclass learning with noisy labels in which over 50% of the training labels are wrong.\n",
    "Adjust the ```trace``` parameter in ```generate_noise_matrix_from_trace``` above to try out different amounts of noise. Note that, as we prove in our paper, learning becomes impossible if ```trace <= 1```, so choose a value greater than 1 and no larger than the number of classes (3)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Plotting is only supported in an iPython interface.\n",
      "The actual, latent, underlying noise matrix.\n",
      "\n",
      " Noise Matrix (aka Noisy Channel) P(s|y) of shape (3, 3)\n",
      " p(s|y)\ty=0\ty=1\ty=2\n",
      "\t---\t---\t---\n",
      "s=0 |\t0.4\t0.21\t0.47\n",
      "s=1 |\t0.25\t0.63\t0.07\n",
      "s=2 |\t0.35\t0.15\t0.47\n",
      "\tTrace(matrix) = 1.5\n",
      "\n",
      "Our estimate of the noise matrix.\n",
      "\n",
      " Noise Matrix (aka Noisy Channel) P(s|y) of shape (3, 3)\n",
      " p(s|y)\ty=0\ty=1\ty=2\n",
      "\t---\t---\t---\n",
      "s=0 |\t0.42\t0.27\t0.46\n",
      "s=1 |\t0.19\t0.52\t0.06\n",
      "s=2 |\t0.39\t0.21\t0.47\n",
      "\tTrace(matrix) = 1.42\n",
      "\n",
      "\n",
      "The actual, latent, underlying joint distribution matrix.\n",
      "\n",
      " Joint Label Noise Distribution Matrix P(s,y) of shape (3, 3)\n",
      " p(s,y)\ty=0\ty=1\ty=2\n",
      "\t---\t---\t---\n",
      "s=0 |\t0.2\t0.05\t0.12\n",
      "s=1 |\t0.12\t0.16\t0.02\n",
      "s=2 |\t0.18\t0.04\t0.12\n",
      "\tTrace(matrix) = 0.48\n",
      "\n",
      "Our estimate of the joint distribution matrix.\n",
      "\n",
      " Joint Label Noise Distribution Matrix P(s,y) of shape (3, 3)\n",
      " p(s,y)\ty=0\ty=1\ty=2\n",
      "\t---\t---\t---\n",
      "s=0 |\t0.24\t0.1\t0.03\n",
      "s=1 |\t0.11\t0.19\t0.0\n",
      "s=2 |\t0.22\t0.08\t0.03\n",
      "\tTrace(matrix) = 0.46\n",
      "\n",
      "Accuracy Comparison\n",
      "-------------------\n",
      "Logistic regression: 0.69875\n",
      "Logistic regression (+rankpruning): 0.74\n",
      "Fit on denoised data without re-weighting: 0.7475\n",
      "Plotting is only supported in an iPython interface.\n"
     ]
    }
   ],
   "source": [
    "# Estimated joint distribution of given and latent labels.\n",
    "est_joint = cleanlab.latent_estimation.estimate_joint(s=s, psx=psx, confident_joint=confident_joint)\n",
    "\n",
    "# True joint p(s, y): each column y of the noise matrix p(s|y) scaled by p(y).\n",
    "true_joint_distribution_of_label_errors = noise_matrix * py\n",
    "# The diagonal holds the mass where s == y, so 100 - 100*trace is the percentage of wrong labels.\n",
    "percent_error_str = 'Percent of training examples that have wrong labels: {}%'.format(\n",
    "    int(round(100 - 100*true_joint_distribution_of_label_errors.trace())))\n",
    "\n",
    "# One RGB color per class, given as 0-255 channels and scaled to fractions.\n",
    "colors = [tuple(channel / 255. for channel in rgb) for rgb in [(31, 119, 180), (255, 127, 14), (44, 160, 44)]]\n",
    "color_map = dict(enumerate(colors))\n",
    "try:\n",
    "    # Plot the distributions for your viewing. This stays best-effort: a failure\n",
    "    # is reported below and the rest of the cell still runs.\n",
    "    # The magic is written without a space after '%'; recent IPython versions\n",
    "    # do not accept `% matplotlib inline`.\n",
    "    %matplotlib inline\n",
    "    from matplotlib import pyplot as plt\n",
    "\n",
    "    # Figure 1: clean data. Digit marker and color both show the latent class.\n",
    "    _ = plt.figure(figsize=(15, 12))\n",
    "    _ = plt.axis('off')\n",
    "    for k in range(K):\n",
    "        X_k = X_train[y_train==k] # data for class k\n",
    "        _ = plt.scatter(X_k[:,0], X_k[:, 1], color=colors[k], s=150, marker=r\"${a}$\".format(a=str(k)), linewidth=1)\n",
    "    _ = plt.title(\"Original (unobserved) distribution, without any label errors.\", fontsize=30)\n",
    "    plt.show()\n",
    "\n",
    "    print(\"\\n\\n\\n\\n\")\n",
    "\n",
    "    # Figure 2: noisy data. Digit = latent class, color = given label,\n",
    "    # circle = the given label is wrong.\n",
    "    label_errors = s != y_train\n",
    "    _ = plt.figure(figsize=(15, 12))\n",
    "    _ = plt.axis('off')\n",
    "    for k in range(K):\n",
    "        in_class = y_train == k\n",
    "        X_k = X_train[in_class] # data for class k\n",
    "        _ = plt.scatter(X_k[:,0], X_k[:, 1], color=[color_map[noisy_k] for noisy_k in s[in_class]], s=150, marker=r\"${a}$\".format(a=str(k)), linewidth=1)\n",
    "    _ = plt.scatter(X_train[:,0][label_errors], X_train[:,1][label_errors], s=400, facecolors='none', edgecolors='black', linewidth=2, alpha = 0.5)\n",
    "    _ = plt.title('Observed distribution, with label errors circled.\\nColors are the given labels, the numbers are latent.\\n'+percent_error_str, fontsize=30)\n",
    "    plt.show()\n",
    "\n",
    "    print(\"\\n\\n\\n\\n\")\n",
    "\n",
    "    # Figure 3: only the points flagged by confident learning, with every true\n",
    "    # label error circled.\n",
    "    _ = plt.figure(figsize=(15, 12))\n",
    "    _ = plt.axis('off')\n",
    "    for k in range(K):\n",
    "        flagged_k = idx_errors & (y_train == k) # class-k points flagged as errors\n",
    "        X_k = X_train[flagged_k]\n",
    "        # Colors come from the same mask as X_k, so there is exactly one color per plotted point.\n",
    "        _ = plt.scatter(X_k[:,0], X_k[:, 1], color=[color_map[noisy_k] for noisy_k in s[flagged_k]], s=150, marker=r\"${a}$\".format(a=str(k)), linewidth=1)\n",
    "    _ = plt.scatter(X_train[:,0][label_errors], X_train[:,1][label_errors], s=400, facecolors='none', edgecolors='black', linewidth=2, alpha = 0.5)\n",
    "    _ = plt.title('Label errors detected using confident learning.\\nEmpty circles show undetected label errors.\\nUncircled data depicts false positives.', fontsize=30)\n",
    "    plt.show()\n",
    "\n",
    "    print(\"\\n\\n\\n\\n\")\n",
    "\n",
    "    # Figure 4: what remains after pruning the flagged points; circles mark the\n",
    "    # label errors that survived pruning.\n",
    "    _ = plt.figure(figsize=(15, 12))\n",
    "    _ = plt.axis('off')\n",
    "    for k in range(K):\n",
    "        kept_k = ~idx_errors & (y_train == k) # class-k points that were not flagged\n",
    "        X_k = X_train[kept_k]\n",
    "        _ = plt.scatter(X_k[:,0], X_k[:, 1], color=[color_map[noisy_k] for noisy_k in s[kept_k]], s=150, marker=r\"${a}$\".format(a=str(k)), linewidth=1)\n",
    "    remaining_errors = ~idx_errors & label_errors\n",
    "    _ = plt.scatter(X_train[:,0][remaining_errors], X_train[:,1][remaining_errors], s=400, facecolors='none', edgecolors='black', linewidth=2, alpha = 0.5)\n",
    "    _ = plt.title('Dataset after pruning detected label errors.', fontsize=30)\n",
    "    plt.show()\n",
    "except Exception as err:\n",
    "    print(\"Plotting is only supported in an iPython interface.\")\n",
    "    # Show the real cause so genuine plotting bugs are not silently hidden.\n",
    "    print(\"Plotting failed with:\", repr(err))\n",
    "\n",
    "# Noise matrix: ground truth next to the estimate.\n",
    "print('The actual, latent, underlying noise matrix.')\n",
    "print_noise_matrix(noise_matrix)\n",
    "print('Our estimate of the noise matrix.')\n",
    "print_noise_matrix(est_noise_matrix)\n",
    "print()\n",
    "\n",
    "# Joint distribution: ground truth next to the estimate.\n",
    "print('The actual, latent, underlying joint distribution matrix.')\n",
    "cleanlab.util.print_joint_matrix(true_joint_distribution_of_label_errors)\n",
    "print('Our estimate of the joint distribution matrix.')\n",
    "cleanlab.util.print_joint_matrix(est_joint)\n",
    "\n",
    "print(\"Accuracy Comparison\")\n",
    "print(\"-------------------\")\n",
    "\n",
    "# Baseline: plain logistic regression fit directly on the noisy labels s.\n",
    "clf = LogisticRegression(solver = 'lbfgs', multi_class = 'auto')\n",
    "clf.fit(X_train, s)\n",
    "baseline_score = accuracy_score(y_test, clf.predict(X_test))\n",
    "print(\"Logistic regression:\", baseline_score)\n",
    "\n",
    "# Confident learning, reusing the psx array computed in an earlier cell.\n",
    "rp = LearningWithNoisyLabels(seed = seed)\n",
    "rp_score = accuracy_score(y_test, rp.fit(X_train, s, psx=psx).predict(X_test))\n",
    "print(\"Logistic regression (+rankpruning):\", rp_score)\n",
    "diff = rp_score - baseline_score\n",
    "\n",
    "# Plain logistic regression again, fit only on the examples not flagged in idx_errors.\n",
    "clf = LogisticRegression(solver = 'lbfgs', multi_class = 'auto')\n",
    "clf.fit(X_train[~idx_errors], s[~idx_errors])\n",
    "print('Fit on denoised data without re-weighting:', accuracy_score(y_test, clf.predict(X_test)))\n",
    "\n",
    "\n",
    "\n",
    "try:\n",
    "    # Best-effort plotting of the three decision boundaries; a failure is\n",
    "    # reported below instead of aborting the cell.\n",
    "    # The magic is written without a space after '%'; recent IPython versions\n",
    "    # do not accept `% matplotlib inline`.\n",
    "    %matplotlib inline\n",
    "    from matplotlib import pyplot as plt\n",
    "\n",
    "    print(\"\\n\\n\\n\\n\\n\\n\")\n",
    "\n",
    "    # Logistic regression fit on the noisy labels s.\n",
    "    clf = LogisticRegression(solver = 'lbfgs', multi_class = 'auto')\n",
    "    _ = clf.fit(X_train, s)\n",
    "    show_decision_boundary(clf, 'Decision boundary for logistic regression trained with noisy labels.\\n Test Accuracy: ' + str(round(baseline_score, 3)))\n",
    "\n",
    "    # The same model fit on the true labels y_train, for comparison.\n",
    "    _ = clf.fit(X_train, y_train)\n",
    "    max_score = accuracy_score(y_test, clf.predict(X_test))\n",
    "    show_decision_boundary(clf, 'Decision boundary for logistic regression trained with no label errors.\\n Test Accuracy: ' + str(round(max_score, 3)), show_noise = False)\n",
    "\n",
    "    # The classifier held by the LearningWithNoisyLabels instance fit earlier in this cell.\n",
    "    show_decision_boundary(rp.clf, 'Decision boundary for LogisticRegression (+rankpruning) trained with noisy labels.\\n Test Accuracy: ' + str(round(rp_score, 3)))\n",
    "except Exception as err:\n",
    "    print(\"Plotting is only supported in an iPython interface.\")\n",
    "    # Show the real cause so genuine plotting bugs are not silently hidden.\n",
    "    print(\"Plotting failed with:\", repr(err))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Param settings: {'prune_method': 'prune_by_noise_rate', 'converge_latent_estimates': True}\n",
      "Accuracy (using confident learning):\t 0.75 \n",
      "\n",
      "The actual, latent, underlying noise matrix:\n",
      " Noise Matrix (aka Noisy Channel) P(s|y) of shape (3, 3)\n",
      " p(s|y)\ty=0\ty=1\ty=2\n",
      "\t---\t---\t---\n",
      "s=0 |\t0.4\t0.21\t0.47\n",
      "s=1 |\t0.25\t0.63\t0.07\n",
      "s=2 |\t0.35\t0.15\t0.47\n",
      "\tTrace(matrix) = 1.5\n",
      "\n",
      "LearningWithNoisyLabels best estimate of the noise matrix:\n",
      " Noise Matrix (aka Noisy Channel) P(s|y) of shape (3, 3)\n",
      " p(s|y)\ty=0\ty=1\ty=2\n",
      "\t---\t---\t---\n",
      "s=0 |\t0.42\t0.27\t0.5\n",
      "s=1 |\t0.19\t0.52\t0.07\n",
      "s=2 |\t0.39\t0.21\t0.44\n",
      "\tTrace(matrix) = 1.38\n",
      "\n",
      "Param settings: {'prune_method': 'prune_by_noise_rate', 'converge_latent_estimates': False}\n",
      "Accuracy (using confident learning):\t 0.74 \n",
      "\n",
      "Param settings: {'prune_method': 'both', 'converge_latent_estimates': True}\n",
      "Accuracy (using confident learning):\t 0.72 \n",
      "\n",
      "Param settings: {'prune_method': 'both', 'converge_latent_estimates': False}\n",
      "Accuracy (using confident learning):\t 0.71 \n",
      "\n",
      "Param settings: {'prune_method': 'prune_by_class', 'converge_latent_estimates': True}\n",
      "Accuracy (using confident learning):\t 0.67 \n",
      "\n",
      "Param settings: {'prune_method': 'prune_by_class', 'converge_latent_estimates': False}\n",
      "Accuracy (using confident learning):\t 0.67 \n",
      "\n",
      "The actual, latent, underlying noise matrix:\n",
      " Noise Matrix (aka Noisy Channel) P(s|y) of shape (3, 3)\n",
      " p(s|y)\ty=0\ty=1\ty=2\n",
      "\t---\t---\t---\n",
      "s=0 |\t0.4\t0.21\t0.47\n",
      "s=1 |\t0.25\t0.63\t0.07\n",
      "s=2 |\t0.35\t0.15\t0.47\n",
      "\tTrace(matrix) = 1.5\n",
      "\n",
      "LearningWithNoisyLabels best estimate of the noise matrix:\n",
      " Noise Matrix (aka Noisy Channel) P(s|y) of shape (3, 3)\n",
      " p(s|y)\ty=0\ty=1\ty=2\n",
      "\t---\t---\t---\n",
      "s=0 |\t0.42\t0.27\t0.49\n",
      "s=1 |\t0.19\t0.52\t0.05\n",
      "s=2 |\t0.39\t0.21\t0.46\n",
      "\tTrace(matrix) = 1.4\n",
      "\n"
     ]
    }
   ],
   "source": [
    "param_grid = {\n",
    "    \"prune_method\": [\"prune_by_noise_rate\", \"prune_by_class\", \"both\"],\n",
    "    \"converge_latent_estimates\": [True, False],\n",
    "}\n",
    "\n",
    "# Fit LearningWithNoisyLabels across all parameter settings.\n",
    "from sklearn.model_selection import ParameterGrid\n",
    "params = ParameterGrid(param_grid)\n",
    "scores = []\n",
    "fitted_models = []  # fitted_models[i] is the model that produced scores[i]\n",
    "for param in params:\n",
    "    clf = LogisticRegression(solver = 'lbfgs', multi_class = 'auto')\n",
    "    rp = LearningWithNoisyLabels(clf = clf, **param)\n",
    "    _ = rp.fit(X_train, s) # s is the noisy y_train labels\n",
    "    scores.append(accuracy_score(rp.predict(X_test), y_test))\n",
    "    fitted_models.append(rp)\n",
    "\n",
    "best_i, worst_i = np.argmax(scores), np.argmin(scores)\n",
    "\n",
    "# Print results sorted from best to worst\n",
    "for i in np.argsort(scores)[::-1]:\n",
    "    print(\"Param settings:\", params[i])\n",
    "    print(\n",
    "        \"Accuracy (using confident learning):\\t\", \n",
    "        round(scores[i], 2),\n",
    "        \"\\n\"\n",
    "    )\n",
    "\n",
    "    # Print noise matrix for highest/lowest scoring models\n",
    "    if i in (best_i, worst_i):\n",
    "        # Use the model fitted with params[i]. The loop variable `param` still\n",
    "        # holds the last grid entry, so retraining with it would show the wrong model.\n",
    "        print('The actual, latent, underlying noise matrix:', end = \"\")\n",
    "        print_noise_matrix(noise_matrix)\n",
    "        print('LearningWithNoisyLabels best estimate of the noise matrix:', end = \"\")\n",
    "        print_noise_matrix(fitted_models[i].noise_matrix)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### In the above example, notice the robustness to hyper-parameter choice and the stability of the algorithms across parameters. No setting of parameters dramatically affects the results. In fact, in certain non-trivial cases, we can prove that certain settings of parameters are equivalent.\n",
    "\n",
    "### In summary, the default parameter settings tend to work well, but feel free to optimize across them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0.2425   0.095625 0.031875]\n",
      " [0.10875  0.18625  0.004375]\n",
      " [0.223125 0.075    0.0325  ]]\n",
      "\n",
      "The above output should look like this:\n",
      "[[0.2425   0.095625 0.031875]\n",
      " [0.10875  0.18625  0.004375]\n",
      " [0.223125 0.075    0.0325  ]]\n"
     ]
    }
   ],
   "source": [
    "# Re-estimate the joint and print it above the confident joint divided by the\n",
    "# number of examples, so the two arrays can be compared by eye.\n",
    "joint = cleanlab.latent_estimation.estimate_joint(s, psx, confident_joint)\n",
    "print(joint, '\\nThe above output should look like this:', confident_joint / len(s), sep='\\n')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
